options(java.parameters = "-Xmx5g")
library(tidyverse)
library(haven)
library(grf)
library(glmnet)
library(bartMachine)
library(ggplot2)
library(ggthemes)
library(evalITR)
library(Cairo)

trainset=read_csv("limited_training_set.csv",guess_max = 10000)


###PAPE Evaluation
set.seed(2019)
plim=0.2
output = matrix(0, nrow = 9, ncol = 9)
for (i in 1:3) {
## Causal Forests
trainset[,i]=trainset[,i]
Y_train=as.numeric(unlist(trainset[,i]))
X_train=trainset%>%select(-c(stdntid,schid,g3treadss,g3tmathss,g3tlangss,classsize))
X_train$race[X_train$race>2]=3
X_train$gender=as.factor(X_train$gender)
X_train$race=as.factor(X_train$race)
#X_train$birthmonth=as.factor(X_train$birthmonth)
X_train$birthyear=as.factor(X_train$birthyear)
X_train$SCHLURBN=as.factor(X_train$SCHLURBN)
X_train$GRDRANGE=as.factor(X_train$GRDRANGE)
X_train_expand=model.matrix(~.-1,data=X_train)
T_train=-trainset$classsize+2
tau.forest<-causal_forest(X_train_expand,Y_train,T_train,ci.group.size = 1,sample.fraction = 0.8,num.trees = 2000)
tau_train<-predict(tau.forest)

X=cbind(X_train,T=T_train)

## BART Trees
barc1<-bartMachine(X=X,y=Y_train,num_trees = 30)
X0=cbind(X_train,T=0)
X1=cbind(X_train,T=1)
Y0<-predict(barc1,X0)
Y1<-predict(barc1,X1)
tau_train2<-Y1-Y0



## LASSO Model
X_expand<-model.matrix(~.*T,data=X)
ls1<-glmnet(X_expand,Y_train,alpha=1,lambda=0.05)




## Testing Data
testset=read_csv("limited_testing_set.csv")
testset[,i]=testset[,i]

Y_test=as.numeric(unlist(testset[,i]))
X_test=testset%>%select(-c(stdntid,schid,g3treadss,g3tmathss,g3tlangss,classsize))
X_test$race[X_test$race>2]=3
X_test$gender=as.factor(X_test$gender)
X_test$race=as.factor(X_test$race)
#X_test$birthmonth=as.factor(X_test$birthmonth)
X_test$birthyear=as.factor(X_test$birthyear)
X_test$SCHLURBN=as.factor(X_test$SCHLURBN)
X_test$GRDRANGE=as.factor(X_test$GRDRANGE)
X_test_expand=model.matrix(~.-1,data=X_test)
T_test=-testset$classsize+2


X0t<-cbind(X_test,T=0)
X1t<-cbind(X_test,T=1)
X0t_expand=model.matrix(~.*T,data=X0t)
X1t_expand=model.matrix(~.*T,data=X1t)

## Causal Forest
tau_test1<-predict(tau.forest,X_test_expand)
That1=as.numeric(tau_test1>0)
## BART
Y0t<-predict(barc1,X0t)
Y1t<-predict(barc1,X1t)
tau_test2<-Y1t-Y0t
That2=as.numeric(tau_test2>0)
## LASSO
Y0t1<-predict(ls1,X0t_expand)
Y1t1<-predict(ls1,X1t_expand)
That3=as.numeric(Y1t1-Y0t1>0)

### PAPE Evaluation
T=-testset$classsize+2
Y=as.numeric(unlist(testset[,i]))
pape1=PAPE(T,That1,Y)
output[i,1]=pape1$pape
output[i,2]=pape1$sd
output[i,3]=mean(That1)

T=-testset$classsize+2
Y=as.numeric(unlist(testset[,i]))
pape2=PAPE(T,That2,Y)
output[i,4]=pape2$pape
output[i,5]=pape2$sd
output[i,6]=mean(That2)


T=-testset$classsize+2
Y=as.numeric(unlist(testset[,i]))
pape3=PAPE(T,That3,Y)
output[i,7]=pape3$pape
output[i,8]=pape3$sd
output[i,9]=mean(That3)

## Causal Forest
tau_test1<-predict(tau.forest,X_test_expand)
That1p = numeric(nrow(testset))
That1p[sort(tau_test1$predictions,decreasing = TRUE,index.return=TRUE)$ix[1:(floor(plim*length(That1))+1)]] = 1
## BART
Y0t<-predict(barc1,X0t)
Y1t<-predict(barc1,X1t)
tau_test2<-Y1t-Y0t
That2p = numeric(nrow(testset))
That2p[sort(tau_test2,decreasing = TRUE,index.return=TRUE)$ix[1:(floor(plim*length(That2))+1)]] = 1
## LASSO
Y0t1<-predict(ls1,X0t_expand)
Y1t1<-predict(ls1,X1t_expand)
tau_test3<-Y1t1-Y0t1
That3p = numeric(nrow(testset))
That3p[sort(tau_test3,decreasing = TRUE,index.return=TRUE)$ix[1:(floor(plim*length(That3))+1)]] = 1

### PAPEp Evaluation
T=-testset$classsize+2
Y=as.numeric(unlist(testset[,i]))
papep1=PAPE(T,That1p,Y,plim)
output[i+3,1]=papep1$pape
output[i+3,2]=papep1$sd
output[i+3,3]=mean(That1p)

T=-testset$classsize+2
Y=as.numeric(unlist(testset[,i]))
papep2=PAPE(T,That2p,Y,plim)
output[i+3,4]=papep2$pape
output[i+3,5]=papep2$sd
output[i+3,6]=mean(That2p)

T=-testset$classsize+2
Y=as.numeric(unlist(testset[,i]))
papep3=PAPE(T,That3p,Y,plim)
output[i+3,7]=papep3$pape
output[i+3,8]=papep3$sd
output[i+3,9]=mean(That3p)


### PAPD Evaluation

papdCFBART=PAPD(T,That1p,That2p,Y,plim)
papdBARTLASSO=PAPD(T,That2p,That3p,Y,plim)
papdCFLASSO=PAPD(T,That1p,That3p,Y,plim)



output[i+6,1]=papdCFBART$papd
output[i+6,2]=papdCFBART$sd
output[i+6,3]=0
output[i+6,4]=papdBARTLASSO$papd
output[i+6,5]=papdBARTLASSO$sd
output[i+6,6]=0
output[i+6,7]=papdCFLASSO$papd
output[i+6,8]=papdCFLASSO$sd
output[i+6,9]=0
write.csv(tau_test1,paste0(c("fixed_CF_" ,toString(i), ".csv"), collapse = ""))
write.csv(tau_test2,paste0(c("fixed_BART_" ,toString(i), ".csv"), collapse = ""))
write.csv(tau_test3,paste0(c("fixed_LASSO_" ,toString(i), ".csv"), collapse = ""))

if (i==3) {
  ### PAPE Evaluation
  T=-testset$classsize+2
  Y=as.numeric(unlist(testset[,i]))
  aupec1=AUPEC(T,tau_test1$predictions,Y)
  outputdf1 = data.frame(type = rep("Causal Forest",length(aupec1)), fraction = seq(1,length(Y))/length(Y), aupec = aupec1$vec+mean(Y))
  
  aupec2=AUPEC(T,tau_test2,Y)
  outputdf2 = data.frame(type = rep("BART",length(aupec2)), fraction = seq(1,length(Y))/length(Y), aupec = aupec2$vec+mean(Y))
  
  aupec3=AUPEC(T,tau_test3,Y)
  outputdf3 = data.frame(type = rep("LASSO",length(aupec3)), fraction = seq(1,length(Y))/length(Y), aupec = aupec3$vec+mean(Y))

  outputdf=do.call("rbind", list(outputdf1, outputdf2, outputdf3))

  graphLabels <- data.frame(type = c("BART","Causal Forest","LASSO"),
                            Pval = c(paste0(c("AUPEC = ", toString(round(aupec2$aupec,digits = 2))," (s.e. = ", toString(round(aupec2$sd,digits = 2)), ")"), collapse=""),paste0(c("AUPEC = ", toString(round(aupec1$aupec,digits = 2))," (s.e. = ", toString(round(aupec1$sd,digits = 2)), ")"), collapse=""), paste0(c("AUPEC = ", toString(round(aupec3$aupec,digits = 2))," (s.e. = ", toString(round(aupec3$sd,digits = 2)), ")"), collapse="")))
  outputdf$type=factor(outputdf$type,levels=c("BART","Causal Forest","LASSO"))
  outputdf$AUPECmin = 0
  outputdf$AUPECmax = 0
  outputdf$AUPECmin[outputdf$type == "Causal Forest"] =outputdf$aupec[outputdf$type == "Causal Forest"]-pape1$sd*1.96
  outputdf$AUPECmin[outputdf$type == "BART"] =outputdf$aupec[outputdf$type == "BART"]-pape2$sd*1.96
  outputdf$AUPECmin[outputdf$type == "LASSO"] =outputdf$aupec[outputdf$type == "LASSO"]-pape3$sd*1.96
  
  outputdf$AUPECmax[outputdf$type == "Causal Forest"] =outputdf$aupec[outputdf$type == "Causal Forest"]+pape1$sd*1.96
  outputdf$AUPECmax[outputdf$type == "BART"] =outputdf$aupec[outputdf$type == "BART"]+pape2$sd*1.96
  outputdf$AUPECmax[outputdf$type == "LASSO"] =outputdf$aupec[outputdf$type == "LASSO"]+pape3$sd*1.96
  CairoPDF("AUPEC.pdf",width=10,height=4)
  show(ggplot(outputdf,aes(x=fraction,y=aupec,group=type)) + geom_line(alpha=0.5,colour="red") + scale_colour_few("Dark")+
    xlab("Maximum Proportion Treated")+ylab("Average SAT Writing Score")+facet_wrap(~type)+scale_x_continuous(labels=scales::percent)+
    theme_few()+ geom_ribbon(aes(ymin=AUPECmin, ymax=AUPECmax),fill="tomato1",alpha=0.2) + 
    geom_abline(intercept = sum(Y*(1-T))/sum(1-T), slope = sum(Y*T)/sum(T)-sum(Y*(1-T))/sum(1-T),size=0.5) +
    geom_text(data = graphLabels, aes(x = 0.35, y = max(outputdf$AUPECmax)+5, label = Pval),size=3.8) +
    theme( text = element_text(size=13.5),
           axis.text = element_text(size=10),
           strip.text = element_text(size = 13.5)))
  dev.off()
  write.csv(outputdf,"fixed_output_aupec.csv")
}
}
write.csv(output,"fixed_output.csv")




# T2=-testset$classsize+2
# Y2=as.numeric(unlist(testset[,1]))
# outputdf=read_csv("outputdf2.csv")
# outputdf$type[outputdf$type=="Causal Forests"]="Causal Forest"
# graphLabels <- data.frame(type = c("BART","Causal Forest","LASSO"),
#            Pval = c("AUPEC = 10.83 (s.e. = 3.92)","AUPEC = 15.34 (s.e. = 4.08)", "AUPEC = 7.18 (s.e. = 4.04)"))
# outputdf$type=factor(outputdf$type,levels=c("BART","Causal Forest","LASSO"))
# CairoPDF("AUPEC2.pdf",width=10,height=4)
# ggplot(outputdf,aes(x=fraction,y=aupec)) + geom_line(alpha=0.5,colour="red") + scale_colour_few("dark")+
#   xlab("Maximum Proportion Treated")+ylab("Average SAT Reading Score")+facet_wrap(~type)+scale_x_continuous(labels=percent)+
#   theme_few()+ 
#   geom_abline(intercept = sum(Y2*(1-T2))/sum(1-T2), slope = sum(Y2*T2)/sum(T2)-sum(Y2*(1-T2))/sum(1-T2),size=0.5) +
#   geom_text(data = graphLabels, aes(x = 0.35, y = 700, label = Pval),size=3.8) +
#   theme( text = element_text(size=13.5),
#          axis.text = element_text(size=10),
#          strip.text = element_text(size = 13.5))
# dev.off()
# 
# T2=-testset$classsize+2
# Y2=as.numeric(unlist(testset[,3]))
# outputdf=read_csv("outputdf1.csv")
# outputdf$type[outputdf$type=="Causal Forests"]="Causal Forest"
# graphLabels <- data.frame(type = c("BART","Causal Forest","LASSO"),
#                           Pval = c("AUPEC = 17.20 (s.e. = 4.17)","AUPEC = 19.10 (s.e. = 4.23)", "AUPEC = 7.93 (s.e. = 4.16)"))
# outputdf$type=factor(outputdf$type,levels=c("BART","Causal Forest","LASSO"))
# CairoPDF("AUPEC1.pdf",width=10,height=4)
# ggplot(outputdf,aes(x=fraction,y=aupec)) + geom_line(alpha=0.5,colour="red") + scale_colour_few("dark")+
#   xlab("Maximum Proportion Treated")+ylab("Average SAT Writing Score")+facet_wrap(~type)+scale_x_continuous(labels=percent)+
#   theme_few()+ 
#   geom_abline(intercept = sum(Y2*(1-T2))/sum(1-T2), slope = sum(Y2*T2)/sum(T2)-sum(Y2*(1-T2))/sum(1-T2),size=0.5) +
#   geom_text(data = graphLabels, aes(x = 0.35, y = 720, label = Pval),size=3.8) +
#   theme( text = element_text(size=13.5),
#          axis.text = element_text(size=10),
#          strip.text = element_text(size = 13.5))
# dev.off()
# 
# T2=-testset$classsize+2
# Y2=as.numeric(unlist(testset[,2]))
# outputdf=read_csv("outputdf.csv")
# outputdf$type[outputdf$type=="Causal Forests"]="Causal Forest"
# graphLabels <- data.frame(type = c("BART","Causal Forest","LASSO"),
#                           Pval = c("AUPEC = 23.60 (s.e. = 4.02)","AUPEC = 20.00 (s.e. = 4.14)", "AUPEC = -6.20 (s.e. = 4.13)"))
# outputdf$type=factor(outputdf$type,levels=c("BART","Causal Forest","LASSO"))
# outputdf$AUPECmin=outputdf$aupec-4.65^2*1.96
# outputdf$AUPECmax=outputdf$aupec+4.65^2*1.96
# CairoPDF("AUPEC.pdf",width=10,height=4)
# ggplot(outputdf,aes(x=fraction,y=aupec)) + geom_line(alpha=0.5,colour="red") + scale_colour_few("Dark")+
#   xlab("Maximum Proportion Treated")+ylab("Average SAT Math Score")+facet_wrap(~type)+scale_x_continuous(labels=scales::percent)+
#   theme_few()+ geom_ribbon(aes(ymin=AUPECmin, ymax=AUPECmax),fill="tomato1",alpha=0.2) + 
#   geom_abline(intercept = sum(Y2*(1-T2))/sum(1-T2), slope = sum(Y2*T2)/sum(T2)-sum(Y2*(1-T2))/sum(1-T2),size=0.5) +
#   geom_text(data = graphLabels, aes(x = 0.35, y = 720, label = Pval),size=3.8) +
#   theme( text = element_text(size=13.5),
#          axis.text = element_text(size=10),
#          strip.text = element_text(size = 13.5))
# dev.off()
# 
# 
# 
# 
# 
# T2=-testset$classsize+2
# Y2=as.numeric(unlist(testset[,1]))
# outputdf=read_csv("outputdf2.csv")
# outputdf$type[outputdf$type=="Causal Forests"]="Causal Forest"
# graphLabels <- data.frame(type = c("BART","Causal Forest","LASSO"),
#                           Pval = c("AUPEC = 10.83 (s.e. = 3.92)","AUPEC = 15.34 (s.e. = 4.08)", "AUPEC = 7.18 (s.e. = 4.04)"))
# outputdf$type=factor(outputdf$type,levels=c("BART","Causal Forest","LASSO"))
# outputdf$AUPECt=outputdf$aupec - (sum(Y2*(1-T2))/sum(1-T2) + outputdf$fraction*(sum(Y2*T2)/sum(T2)-sum(Y2*(1-T2))/sum(1-T2)))
# outputdf$AUPECtmin=outputdf$AUPECt-4.65*1.96
# outputdf$AUPECtmax=outputdf$AUPECt+4.65*1.96
# 
# CairoPDF("AUPECt_2.pdf",width=10,height=4)
# ggplot(outputdf,aes(x=fraction,y=AUPECt)) + geom_line(alpha=0.7,colour="red2",size=0.2) + scale_colour_few("dark")+ geom_ribbon(aes(ymin=AUPECtmin, ymax=AUPECtmax),fill="tomato1",alpha=0.2)+
#   xlab("Maximum Proportion Treated")+ylab("PAPE on SAT Reading Score")+facet_wrap(~type)+scale_x_continuous(labels=percent)+geom_segment(x=0,y=0,xend=1,yend=0,size=0.5) +
#   theme_few()+  geom_text(data = graphLabels, aes(x = 0.35, y = 70, label = Pval),size=3.8) +
#   theme( text = element_text(size=13.5),
#          axis.text = element_text(size=10),
#          strip.text = element_text(size = 13.5))
# dev.off()
# 
# T2=-testset$classsize+2
# Y2=as.numeric(unlist(testset[,3]))
# outputdf=read_csv("outputdf1.csv")
# outputdf$type[outputdf$type=="Causal Forests"]="Causal Forest"
# graphLabels <- data.frame(type = c("BART","Causal Forest","LASSO"),
#                           Pval = c("AUPEC = 17.20 (s.e. = 4.17)","AUPEC = 19.10 (s.e. = 4.23)", "AUPEC = 7.93 (s.e. = 4.16)"))
# outputdf$type=factor(outputdf$type,levels=c("BART","Causal Forest","LASSO"))
# outputdf$AUPECt=outputdf$aupec - (sum(Y2*(1-T2))/sum(1-T2) + outputdf$fraction*(sum(Y2*T2)/sum(T2)-sum(Y2*(1-T2))/sum(1-T2)))
# outputdf$AUPECtmin=outputdf$AUPECt-4.65*1.96
# outputdf$AUPECtmax=outputdf$AUPECt+4.65*1.96
# CairoPDF("AUPECt_1.pdf",width=10,height=4)
# ggplot(outputdf,aes(x=fraction,y=AUPECt)) + geom_line(alpha=0.7,colour="red2",size=0.2) + scale_colour_few("dark")+ geom_ribbon(aes(ymin=AUPECtmin, ymax=AUPECtmax),fill="tomato1",alpha=0.2)+
#   xlab("Maximum Proportion Treated")+ylab("PAPE on SAT Writing Score")+facet_wrap(~type)+scale_x_continuous(labels=percent)+geom_segment(x=0,y=0,xend=1,yend=0,size=0.5) +
#   theme_few()+  geom_text(data = graphLabels, aes(x = 0.35, y = 75, label = Pval),size=3.8) +
#   theme( text = element_text(size=13.5),
#          axis.text = element_text(size=10),
#          strip.text = element_text(size = 13.5))
# dev.off()
# 
# T2=-testset$classsize+2
# Y2=as.numeric(unlist(testset[,2]))
# outputdf=read_csv("outputdf.csv")
# outputdf$type[outputdf$type=="Causal Forests"]="Causal Forest"
# graphLabels <- data.frame(type = c("BART","Causal Forest","LASSO"),
#                           Pval = c("AUPEC = 23.60 (s.e. = 4.02)","AUPEC = 20.00 (s.e. = 4.14)", "AUPEC = -6.20 (s.e. = 4.13)"))
# outputdf$type=factor(outputdf$type,levels=c("BART","Causal Forest","LASSO"))
# outputdf$AUPECt=outputdf$aupec - (sum(Y2*(1-T2))/sum(1-T2) + outputdf$fraction*(sum(Y2*T2)/sum(T2)-sum(Y2*(1-T2))/sum(1-T2)))
# outputdf$AUPECtmin=outputdf$AUPECt-4.65*1.96
# outputdf$AUPECtmax=outputdf$AUPECt+4.65*1.96
# CairoPDF("AUPECt.pdf",width=10,height=4)
# ggplot(outputdf,aes(x=fraction,y=AUPECt)) + geom_line(alpha=0.7,colour="red2",size=0.2) + scale_colour_few("dark")+ geom_ribbon(aes(ymin=AUPECtmin, ymax=AUPECtmax),fill="tomato1",alpha=0.2)+
#   xlab("Maximum Proportion Treated")+ylab("PAPE on SAT Math Score")+facet_wrap(~type)+scale_x_continuous(labels=percent)+geom_segment(x=0,y=0,xend=1,yend=0,size=0.5) +
#   theme_few()+  geom_text(data = graphLabels, aes(x = 0.35, y = 75, label = Pval),size=3.8) +
#   theme( text = element_text(size=13.5),
#          axis.text = element_text(size=10),
#          strip.text = element_text(size = 13.5))
# dev.off()


# set.seed(2019)
# 
# 
# ### PAPD Example
# for (i in 1:3) {
#   ## Causal Forests
#   Y_train=as.numeric(unlist(trainset[,i]))
#   X_train=trainset%>%select(-c(stdntid,schid,g3treadss,g3tmathss,g3tlangss,classsize))
#   X_train$race[X_train$race>2]=3
#   X_train$gender=as.factor(X_train$gender)
#   X_train$race=as.factor(X_train$race)
#   #X_train$birthmonth=as.factor(X_train$birthmonth)
#   X_train$birthyear=as.factor(X_train$birthyear)
#   X_train$SCHLURBN=as.factor(X_train$SCHLURBN)
#   X_train$GRDRANGE=as.factor(X_train$GRDRANGE)
#   X_train_expand=model.matrix(~.-1,data=X_train)
#   T_train=-trainset$classsize+2
#   tau.forest<-causal_forest(X_train_expand,Y_train,T_train,ci.group.size = 1,sample.fraction = 0.9,num.trees = 2000)
#   tau_train<-predict(tau.forest)
#   
#   
#   
#   X=cbind(X_train,T=T_train)
#   
#   
#   
#   ## LASSO Model
#   X_expand<-model.matrix(~.*T,data=X)
#   ls1<-glmnet(X_expand,Y_train,alpha=1,lambda=0.05)
#   
#   
#   
#   
#   ## Testing Data
#   testset=read_csv("limited_testing_set_new.csv")
#   Y_test=as.numeric(unlist(testset[,i]))
#   X_test=testset%>%select(-c(stdntid,schid,g3treadss,g3tmathss,g3tlangss,classsize))
#   X_test$race[X_test$race>2]=3
#   X_test$gender=as.factor(X_test$gender)
#   X_test$race=as.factor(X_test$race)
#   #X_test$birthmonth=as.factor(X_test$birthmonth)
#   X_test$birthyear=as.factor(X_test$birthyear)
#   X_test$SCHLURBN=as.factor(X_test$SCHLURBN)
#   X_test$GRDRANGE=as.factor(X_test$GRDRANGE)
#   X_test_expand=model.matrix(~.-1,data=X_test)
#   T_test=-testset$classsize+2
#   
#   
#   X0t<-cbind(X_test,T=0)
#   X1t<-cbind(X_test,T=1)
#   X0t_expand=model.matrix(~.*T,data=X0t)
#   X1t_expand=model.matrix(~.*T,data=X1t)
#   
#   ## Causal Forest
#   tau_test1<-predict(tau.forest,X_test_expand)
#   That1 = numeric(nrow(testset))
#   That1[sort(tau_test1$predictions,decreasing = TRUE,index.return=TRUE)$ix[1:(round(plim*length(That1))+1)]] = 1
#   ## LASSO
#   Y0t1<-predict(ls1,X0t_expand)
#   Y1t1<-predict(ls1,X1t_expand)
#   tau_test3<-Y1t1-Y0t1
#   That3 = numeric(nrow(testset))
#   That3[sort(tau_test3,decreasing = TRUE,index.return=TRUE)$ix[1:(round(plim*length(That3))+1)]] = 1
#   
#   ### PAPD Evaluation
#   T=-testset$classsize+2
#   Y=as.numeric(unlist(testset[,i]))
#   papd1=PAPD(T,That1,That3,Y,plim)
#   print(papd1$papd)
#   print(sqrt(papd1$sd))
#   
# 
# }